from platform import win32_edition
import numpy as np
import cupy as cp
import scipy
import os
from os.path import exists, isfile, join
from pathlib import Path
import sys
import shutil
import gc
import math
import gensim
from gensim.models.coherencemodel import CoherenceModel
from gensim.corpora import Dictionary
import json


# Import stopwords
import nltk
from nltk import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.util import everygrams

# Import TensorLy
import tensorly as tl
import cudf
from cudf import Series
from cuml.feature_extraction.text import CountVectorizer
from cuml.preprocessing.text.stem import PorterStemmer
import cupyx 

#Insert Plotly
import pandas as pd
import time
import pickle

# Import utility functions from other files
#from version0_99.tlda_wrapper import TLDA
import version0_99.file_operations as fop



# Constants
# Input locations. ROOT_DIR is an absolute, machine-specific path; INDIR is
# joined onto it further down in this file to locate the input files.

ROOT_DIR        = "/media/large_storage/PA-Replication/Covid/data/"
INDIR           = "split_files/"
RAW_DATA_PREFIX = "split_files/"  # same value as INDIR


# Output Relative paths -- do not change
# Names ending in "/" are directory prefixes; the others are file names.
# NOTE(review): none of these is referenced in the part of the file visible
# here -- presumably they are consumed by other pipeline stages; confirm.
X_MAT_FILEPATH_PREFIX = "x_mat/"
X_FILEPATH = "X_full.obj"
X_DF_FILEPATH = "X_df.obj"
X_LST_FILEPATH = "X_lst.obj"
CORPUS_FILEPATH_PREFIX = "corpus/"
GENSIM_CORPUS_FILEPATH = "corpus.obj"
COUNTVECTOR_FILEPATH = "countvec.obj"
TLDA_FILEPATH = "tlda.obj"
VOCAB_FILEPATH = "vocab.csv"
EXISTING_VOCAB_FILEPATH = "vocab.obj"
TOPIC_FILEPATH_PREFIX   = 'predicted_topics/'
DOCUMENT_TOPIC_FILEPATH = 'dtm.csv'
COHERENCE_FILEPATH = 'coherence.obj'
DOCUMENT_TOPIC_FILEPATH_TOT = 'dtm_df.csv'
OUT_ID_DATA_PREFIX = 'ids/' 
TOP_WORDS_FILEPATH ='top_words.csv'

# Device settings
# Select CuPy as the TensorLy backend. This runs at import time, so it must
# stay ahead of any TensorLy call made further down.
backend="cupy"
tl.set_backend(backend)
# NOTE(review): `device` is not read anywhere in the visible part of the file.
device = 'cuda'
# Module-level cuML Porter stemmer instance.
porter = PorterStemmer()


def basic_clean(df):
    """Normalise the ``tweet`` column and drop duplicated rows.

    Steps, in order: cast ``tweet`` to string, drop rows that are exact
    duplicates across all columns (keeping the first occurrence), lower-case
    the text, and delete every run of characters that is neither a word
    character nor whitespace.

    Args:
        df: DataFrame with a ``tweet`` column (pandas-style API; the rest of
            this file works with cuDF -- TODO confirm which one callers pass).

    Returns:
        The de-duplicated DataFrame with the cleaned ``tweet`` column.

    Note:
        The string cast is written into the frame that was passed in, so the
        caller's ``tweet`` column is converted to ``str`` as a side effect.
    """
    df['tweet'] = df['tweet'].astype('str')
    df = df.drop_duplicates(keep="first")
    # regex=True is spelled out on purpose: pandas >= 2.0 treats the pattern
    # as a literal string by default, which would leave punctuation in place.
    # cuDF accepts the same keyword, so the call works with either library.
    df['tweet'] = df['tweet'].str.lower().str.replace(r'[^\w\s]+', '', regex=True)
    return df



def partial_fit(self, data):
    """Fit on ``data`` while keeping vocabulary entries from earlier fits.

    Written to be attached to ``CountVectorizer`` as a method (see the
    assignment further down in this file). The vocabulary present before the
    call (an empty Series when the vectorizer has never been fitted) is
    concatenated with the vocabulary produced by ``self.fit(data)``, and the
    unique values of the result are stored back on ``self.vocabulary_``.

    Args:
        self: vectorizer instance exposing ``fit`` and ``vocabulary_``.
        data: batch of documents forwarded unchanged to ``self.fit``.
    """
    # Capture the old vocabulary first: fit() overwrites vocabulary_.
    previous_vocab = self.vocabulary_ if hasattr(self, 'vocabulary_') else Series()
    self.fit(data)
    merged_vocab = cudf.concat([previous_vocab, self.vocabulary_])
    self.vocabulary_ = merged_vocab.unique()

def tune_filesplit_size_on_IPCA_batch_size(IPCA_batchsize):
    """Placeholder that is not implemented yet.

    Args:
        IPCA_batchsize: currently ignored.

    Returns:
        Always ``None``.
    """
    return


# declare the stop words
# Start from NLTK's English stop-word list ...
stop_words = (stopwords.words('english'))
# ... and extend it with project-specific tokens. The list contains repeated
# entries (e.g. "say", "thi", "rt"); they are kept as written.
# NOTE(review): several entries look like stemmed forms ("abus", "mani",
# "becaus") -- presumably they are meant to match stemmed text; confirm.
# NOTE(review): the entry between "vs" and "|" looks like a mis-encoded
# "i'm" with a curly apostrophe; left untouched in case the raw data carries
# the same character sequence -- confirm.
added_words = ["thread","say","will","has","by","for","hi","hey","hah","thank","watch","doe",
               "said","talk","congrats","congratulations","are","as","i", "time","abus","year","mani",
               "me", "my", "myself", "we", "our", "ours", "ourselves", "use","look","movement","assault",
               "you", "your", "yours","he","her","him","she","hers","that","harass","whi","feel","say","gt",
               "be","with","their","they're","is","was","been","not","they","way","thi","rt","i","we","and",
               "to","for","do","go",
               "it","have",  "one","think",   "thing","bring","put","well","take","exactli","tell",
               "good","day","work", "latest","today","becaus","peopl","via","see","old","ani","covid-19","-",
               "call", "wouldnt","wow", "learned","hi","-","", "things" ,"thing","can't","can","right","got","show",
               "cant","will","go","going","let","would","could","him","his","think","thi","ha","onli","back",
               "lets","let's","say","says","know","talk","talked","talks","dont","think","watch","right",
               "said","something","this","was","has","had","abc","rt","ha","haha","hat","even","happen",
               "something","wont","people","make","want","went","goes","people","had","also","ye","still","must",
               "person","like","come","from","yet","able","wa","yah","yeh","yeah","onli","ask","give","read",
               "need", "men", "women", "get", "man", "amp","amp&","yr","yrs","&amp;","amp",
               "shirt", "vs","iâ€™m","|",]

# set stop words and countvectorizer method
# np.append joins both lists into one array; list() turns it back into a
# Python list. No de-duplication happens here.
stop_words= list(np.append(stop_words,added_words))
# Attach partial_fit (defined above) to the CountVectorizer class so that
# every instance exposes it as a method.
CountVectorizer.partial_fit = partial_fit

# define function with no preprocessing
def custom_preprocessor(doc):
    """Identity preprocessor: hand ``doc`` back untouched.

    Passed to ``CountVectorizer`` as its ``preprocessor`` so that no
    additional text preprocessing is applied by this hook.

    Args:
        doc: the input handed over by the vectorizer.

    Returns:
        The very same object that was passed in.
    """
    return doc



# set your text pre-processing params

# make final directories for outputs
#save_dir = os.path.join(ROOT_DIR, curr_dir)
#if not os.path.exists(save_dir):
#    os.makedirs(save_dir)



# Module-level vectorizer: extended stop-word list, lower-casing, unigrams
# plus bigrams, and the identity preprocessor defined above.
# NOTE(review): max_df / min_df are given as integers here -- presumably
# absolute document counts, as in scikit-learn; confirm for cuML.
countvec = CountVectorizer( stop_words = stop_words, # works
                            lowercase = True, # works
                            ngram_range = (1, 2), # allow for bigrams
                            preprocessor = custom_preprocessor,
                            max_df = 500000, # author's notes: previously 100000; "limit this to 10,000"; 500000 for 8M
                            min_df = 200) # author's notes: previously 2000; "limit this to 20"; 2500 for 8M

#eigenvec_str = "_n_eigenvec_" + (str(n_eigenvec) if n_eigenvec is not None else "None")

#exp_save_dir = os.path.join(save_dir, "num_tops_" + str(num_tops) + "_alpha0_" + str(alpha_0) + "_learning_rate_" + str(learning_rate) + "_theta_" + str(theta_param) + "_orthogonality_" + str(ortho_loss_param) + "_initialize_first_docs_" + str(initialize_first_docs) + eigenvec_str + "/")
#if not os.path.exists(exp_save_dir):
#    os.makedirs(exp_save_dir)



# DEFAULT PARAMS
# Author's note on batch_size_pca: this will handle 8000 words + 100 topics
# and an unlimited number of documents.
batch_size_pca  = 220000
# Author's notes on batch_size_grad: use about 1% of the TOTAL data size and
# check what coherence looks like; can also try increasing (divide data by
# 1,000). Recorded trials as written by the author:
#   800  -> -3322.32 (6000 seconds)
#   4000 -> -3320    (1800 seconds)
#   8000 -> -3325    (1180 seconds)
batch_size_grad = 12500
smoothing   = 1e-7
n_iter_train = 200
n_iter_test = 10

# NOTE(review): these fractional thresholds are not what `countvec` uses --
# it is constructed earlier in this file with integer literals.
max_df = 0.2
min_df = 0.0005

#SET SEED
seed = 57

# Program controls
# 0/1 switches. None of them is read in the part of the file visible here --
# presumably they gate pipeline stages elsewhere; confirm.
vocab_build    = 0
#save_files     = first_run
stgd           = 1
split_files    = 0
compute_mean   = 1
recover_top_words = 1
transform_data    = 1
create_meta_df    = 1
coherence         = 1

# Other globals
num_data_rows = 0
# max_data_rows = 1.2e6

#Start

# Banner marking the start of the run.
print("\n\nSTART...")

"""
We want 
1. Do gridsearch over topics, alpha, (LDA parameters, learning rate, theta,ortho loss param ) 
a. Gensim coherence measures: 'u_mass', 'c_v', 'c_uci', 'c_npmi', "perplexity" 
danny will figure out perplexity(scatter plot of perplexity vs. Coherence)
b. Report Word Clouds
c. Document-topics inference
d. Top 5 tweets from each topic
2. For the optimal topic:
a. Time series trends in probability of key topics (with all topics in appendix, danny will 
                                                    produce from 1.c output in R)


To dos:
- wrap pipeline in a function that takes in a list of params to iterate over 
- write script to output lists of params for the terminal command 
- Account for breaks due to memory constraints being hit/other snags
- add code for computing Gensim coherence measures (Danny)
- add code for outputting coherence, time, parameters, (file names for outputs?) in JSON (Sara)
- add code for outputting document/topic inference, top 5 tweets, wordclouds (Danny)

"""
# Directory that holds the input files.
inDir = os.path.join(ROOT_DIR, INDIR)

# Sorted result of the project helper (presumably the file listing of inDir).
dl = sorted(fop.get_files_in_dir(inDir))

# Load a previously pickled TLDA model. The path is relative to the current
# working directory and encodes the hyper-parameters of that experiment.
# NOTE(security): pickle.load can execute arbitrary code embedded in the
# file -- only load model files that come from a trusted source.
tlda_model_path = 'data/covid_experiment/num_tops_5_alpha0_0.0001_learning_rate_1e-05_theta_5.005_orthogonality_1000_initialize_first_docs_True_n_eigenvec_20/tlda.obj'
# The context manager closes the file handle once loading is done; the
# original one-liner left it open.
with open(tlda_model_path, 'rb') as tlda_file:
    tlda = pickle.load(tlda_file)
print("Load Factors")
# Recompute the unwhitened factors and store them on the model object.
tlda.unwhitened_factors_= tlda._unwhiten_factors()
# Convert the backend tensor to a NumPy array so NumPy can write it out.
tlda_array = tl.to_numpy(tlda.unwhitened_factors_)
print(tlda_array.shape)
#print(tlda.unwhitened_factors_)
print(tlda.n_documents)
print(tl.sum(tlda.mean*tlda.n_documents))
#tlda_array.tofile('tlda.csv', sep = ',')
# Write the factor matrix as comma-separated text into the working directory.
np.savetxt("tlda.csv", tlda_array, delimiter=",")

